This mini-project explores summary data from a clinical trial on Dapagliflozin (NCT01400884) in type 2 diabetes. It includes data cleaning, statistical summaries, AE profiling, survival analysis, and interactive visualizations, mimicking real-world workflows in pharmacovigilance and clinical reporting.
Data Handling: tidyverse, janitor, here
Reporting: gtsummary, gt, Tplyr
Visualization: ggplot2, patchwork, plotly, survminer
Interactivity: DT, shiny
Clinical Packages: admiral, survival
In this phase, we load and preprocess the clinical trial data. The focus is on standardizing variable names, correcting data types, and formatting the datasets for downstream analysis.
# Load libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tibble' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
## Warning: package 'janitor' was built under R version 4.4.3
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(here)
## Warning: package 'here' was built under R version 4.4.3
## here() starts at C:/Users/simranpreet/OneDrive - Nottingham Trent University
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.4.3
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.4.3
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.4.3
library(gt)
## Warning: package 'gt' was built under R version 4.4.3
library(flextable)
## Warning: package 'flextable' was built under R version 4.4.3
##
## Attaching package: 'flextable'
##
## The following objects are masked from 'package:ggpubr':
##
## border, font, rotate
##
## The following object is masked from 'package:gtsummary':
##
## continuous_summary
##
## The following object is masked from 'package:purrr':
##
## compose
library(shiny)
## Warning: package 'shiny' was built under R version 4.4.3
library(DT)
## Warning: package 'DT' was built under R version 4.4.3
##
## Attaching package: 'DT'
##
## The following objects are masked from 'package:shiny':
##
## dataTableOutput, renderDataTable
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
##
## Attaching package: 'plotly'
##
## The following objects are masked from 'package:flextable':
##
## highlight, style
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(cardx)
## Warning: package 'cardx' was built under R version 4.4.3
library(rmarkdown)
## Warning: package 'rmarkdown' was built under R version 4.4.3
library(officer)
## Warning: package 'officer' was built under R version 4.4.3
library(webshot)
## Warning: package 'webshot' was built under R version 4.4.3
library(tinytex)
## Warning: package 'tinytex' was built under R version 4.4.3
library(survival)
## Warning: package 'survival' was built under R version 4.4.3
library(survminer)
## Warning: package 'survminer' was built under R version 4.4.3
##
## Attaching package: 'survminer'
##
## The following object is masked from 'package:survival':
##
## myeloma
library(admiral)
## Warning: package 'admiral' was built under R version 4.4.3
library(Tplyr)
## Warning: package 'Tplyr' was built under R version 4.4.3
library(remotes)
## Warning: package 'remotes' was built under R version 4.4.3
library(bslib)
## Warning: package 'bslib' was built under R version 4.4.3
##
## Attaching package: 'bslib'
##
## The following object is masked from 'package:utils':
##
## page
# Load and clean datasets
# The data directory is built from project-relative components rather than a
# user-specific absolute path, so the report is not tied to one machine's
# home directory. With the project root reported by here() for this session,
# this resolves to the same folder the absolute path pointed at.
data_dir <- here(
  "clinical_trial_NCT01400884", "clinical-trial-summary-real", "data"
)
# clean_names() standardises the column names of each raw file.
baseline <- read_csv(
  file.path(data_dir, "baseline.csv"),
  show_col_types = FALSE
) %>%
  clean_names()
ae_summary <- read_csv(
  file.path(data_dir, "ae_summary.csv"),
  show_col_types = FALSE
) %>%
  clean_names()
# Clean baseline data: arm as a factor, sex percentages as numeric,
# BMI rounded to one decimal, and only the reporting columns retained.
baseline_clean <- baseline %>%
  mutate(
    treatment_arm = factor(treatment_arm),
    across(c(sex_male_percent, sex_female_percent), as.numeric),
    bmi_mean = round(bmi_mean, digits = 1)
  ) %>%
  select(
    treatment_arm, n,
    age_mean, age_sd,
    sex_male_percent, sex_female_percent,
    bmi_mean
  )
# Clean AE data: factor the arm, title-case the category, order severity
# from mildest to most severe, and round the percentage to one decimal.
severity_levels <- c("Mild", "Moderate", "Severe")
ae_clean <- ae_summary %>%
  mutate(
    treatment_arm = factor(treatment_arm),
    ae_category = str_to_title(ae_category),
    ae_severity = factor(ae_severity, levels = severity_levels),
    percent_events = round(percent_percent, digits = 1)
  ) %>%
  select(
    treatment_arm,
    ae_category,
    ae_severity,
    n_events,
    percent_events
  )
# Save cleaned files under outputs/cleaned, creating the folder on first run
cleaned_dir <- here("outputs", "cleaned")
if (!dir.exists(cleaned_dir)) {
  dir.create(cleaned_dir, recursive = TRUE)
}
write_csv(baseline_clean, file.path(cleaned_dir, "baseline_clean.csv"))
write_csv(ae_clean, file.path(cleaned_dir, "ae_summary_clean.csv"))
This phase transforms cleaned data into meaningful statistical tables and plots to summarize baseline characteristics, adverse events, and treatment outcomes.
Objectives:

- Compare demographics (age, BMI) across treatment arms.
- Explore adverse event distributions by severity.
- Generate Kaplan-Meier survival curves.
# Load cleaned data back from outputs/cleaned
read_cleaned <- function(file_name) {
  read_csv(here("outputs", "cleaned", file_name), show_col_types = FALSE)
}
baseline_clean <- read_cleaned("baseline_clean.csv")
ae_clean <- read_cleaned("ae_summary_clean.csv")
# Baseline characteristics table
# `baseline_clean` holds one pre-aggregated row per treatment arm (arm size,
# mean/SD age, sex percentages, mean BMI). Feeding those aggregates to
# tbl_summary() treated every summary value as a categorical level shown as
# "1 (100%)", and add_p() then ran Fisher's exact test on one observation per
# arm, so the reported p-values (> 0.9) carried no information. The
# aggregates are therefore formatted and displayed directly; between-arm
# tests would need subject-level data.
baseline_clean %>%
  mutate(
    age = sprintf("%.1f (%.1f)", age_mean, age_sd),
    sex_male = sprintf("%.1f%%", sex_male_percent),
    sex_female = sprintf("%.1f%%", sex_female_percent),
    bmi = sprintf("%.1f", bmi_mean)
  ) %>%
  select(treatment_arm, n, age, sex_male, sex_female, bmi) %>%
  gt() %>%
  cols_label(
    treatment_arm = md("**Treatment Arm**"),
    n = md("**N**"),
    age = md("**Age, Mean (SD)**"),
    sex_male = md("**Male**"),
    sex_female = md("**Female**"),
    bmi = md("**BMI, Mean**")
  )
| Variable | Dapagliflozin 10mg, N = 1¹ | Dapagliflozin 5mg, N = 1¹ | Placebo, N = 1¹ | p-value² |
|---|---|---|---|---|
| n | | | | >0.9 |
| &nbsp;&nbsp;&nbsp;&nbsp;97 | 1 (100%) | 0 (0%) | 0 (0%) | |
| &nbsp;&nbsp;&nbsp;&nbsp;98 | 0 (0%) | 1 (100%) | 0 (0%) | |
| &nbsp;&nbsp;&nbsp;&nbsp;100 | 0 (0%) | 0 (0%) | 1 (100%) | |
| age_mean | | | | >0.9 |
| &nbsp;&nbsp;&nbsp;&nbsp;57.6 | 1 (100%) | 0 (0%) | 0 (0%) | |
| &nbsp;&nbsp;&nbsp;&nbsp;58.2 | 0 (0%) | 0 (0%) | 1 (100%) | |
| &nbsp;&nbsp;&nbsp;&nbsp;59.1 | 0 (0%) | 1 (100%) | 0 (0%) | |
| age_sd | | | | >0.9 |
| &nbsp;&nbsp;&nbsp;&nbsp;8.7 | 0 (0%) | 1 (100%) | 0 (0%) | |
| &nbsp;&nbsp;&nbsp;&nbsp;9.1 | 0 (0%) | 0 (0%) | 1 (100%) | |
| &nbsp;&nbsp;&nbsp;&nbsp;9.3 | 1 (100%) | 0 (0%) | 0 (0%) | |
| sex_male_percent | | | | >0.9 |
| &nbsp;&nbsp;&nbsp;&nbsp;55 | 0 (0%) | 0 (0%) | 1 (100%) | |
| &nbsp;&nbsp;&nbsp;&nbsp;58 | 1 (100%) | 0 (0%) | 0 (0%) | |
| &nbsp;&nbsp;&nbsp;&nbsp;60 | 0 (0%) | 1 (100%) | 0 (0%) | |
| sex_female_percent | | | | >0.9 |
| &nbsp;&nbsp;&nbsp;&nbsp;40 | 0 (0%) | 1 (100%) | 0 (0%) | |
| &nbsp;&nbsp;&nbsp;&nbsp;42 | 1 (100%) | 0 (0%) | 0 (0%) | |
| &nbsp;&nbsp;&nbsp;&nbsp;45 | 0 (0%) | 0 (0%) | 1 (100%) | |
| bmi_mean | | | | >0.9 |
| &nbsp;&nbsp;&nbsp;&nbsp;30.9 | 1 (100%) | 0 (0%) | 0 (0%) | |
| &nbsp;&nbsp;&nbsp;&nbsp;31.5 | 0 (0%) | 0 (0%) | 1 (100%) | |
| &nbsp;&nbsp;&nbsp;&nbsp;32.1 | 0 (0%) | 1 (100%) | 0 (0%) | |

¹ n (%)

² Fisher's exact test
# Simulate survival data (replace with real data if available)
# Two arms of equal size with exponential event times and a random
# event/censoring indicator; the seed fixes the draws.
set.seed(100)
n_per_arm <- 50
n_total <- 2 * n_per_arm
survival_data <- tibble(
  treatment_arm = rep(c("A", "B"), each = n_per_arm),
  time = round(rexp(n_total, rate = 0.1), 1),
  status = sample(0:1, n_total, replace = TRUE)
)

# Kaplan-Meier plot: one curve per arm with confidence bands, a p-value
# annotation and a risk table.
km_fit <- survfit(
  Surv(time, status) ~ treatment_arm,
  data = survival_data
)
km_plot <- ggsurvplot(
  fit = km_fit,
  data = survival_data,
  title = "Kaplan-Meier Survival by Treatment Arm",
  legend.title = "Treatment",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  conf.int = TRUE,
  pval = TRUE,
  risk.table = TRUE
)
km_plot
# Chi-square test of AE category by treatment arm
# Cells are weighted by n_events so the contingency table holds event counts.
# An unweighted count() only tallies rows of the AE summary, which produced a
# table of all 1s and a degenerate result (X-squared = 0, p-value = 1).
# NOTE(review): this tests event counts, not subjects; repeated events within
# a subject are not independent - confirm this is the intended unit.
chisq_table <- ae_clean %>%
  count(treatment_arm, ae_category, wt = n_events) %>%
  pivot_wider(
    names_from = ae_category,
    values_from = n,
    values_fill = 0
  )
# Drop the arm label column so only the count columns are tested.
chisq.test(chisq_table[, -1])
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: chisq_table[, -1]
## X-squared = 0, df = 2, p-value = 1
# Show the arm-by-AE-category contingency table that the tests operate on
chisq_table
## # A tibble: 3 × 3
## treatment_arm Headache Nausea
## <chr> <int> <int>
## 1 Dapagliflozin 10mg 1 1
## 2 Dapagliflozin 5mg 1 1
## 3 Placebo 1 1
# Fisher's exact test on the same count columns (first column, the arm label,
# is dropped); run alongside the chi-square test, which warned that its
# approximation may be incorrect for this table
fisher.test(chisq_table[,-1])
##
## Fisher's Exact Test for Count Data
##
## data: chisq_table[, -1]
## p-value = 1
## alternative hypothesis: two.sided
# AE plot: event counts per AE category, bars dodged by severity,
# one panel per treatment arm
ae_mapping <- aes(x = ae_category, y = n_events, fill = ae_severity)
ae_plot <- ggplot(data = ae_clean, mapping = ae_mapping) +
  geom_col(position = "dodge") +
  facet_wrap(~ treatment_arm) +
  labs(
    title = "Adverse Events by Treatment Arm",
    x = "AE Category",
    y = "Number of Events"
  ) +
  theme_minimal()
ae_plot
# Patchwork layout for age and BMI: two bar charts placed side by side
age_plot <- ggplot(
  data = baseline_clean,
  mapping = aes(x = treatment_arm, y = age_mean)
) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Mean Age by Treatment Arm",
    y = "Mean Age"
  )
bmi_plot <- ggplot(
  data = baseline_clean,
  mapping = aes(x = treatment_arm, y = bmi_mean)
) +
  geom_col(fill = "orange") +
  labs(
    title = "Mean BMI by Treatment Arm",
    y = "Mean BMI"
  )
# patchwork's `+` combines the two plots into one layout
age_plot + bmi_plot
# Flextable report: per-arm sample size and rounded means, with
# reader-friendly column headers
mean_1dp <- function(x) {
  round(mean(x, na.rm = TRUE), 1)
}
arm_summary <- baseline_clean %>%
  group_by(treatment_arm) %>%
  summarise(
    N = sum(n),
    Age_Mean = mean_1dp(age_mean),
    BMI_Mean = mean_1dp(bmi_mean),
    Male_Percent = mean_1dp(sex_male_percent),
    Female_Percent = mean_1dp(sex_female_percent)
  )
header_labels <- list(
  treatment_arm = "Treatment Arm",
  N = "Sample Size",
  Age_Mean = "Mean Age",
  BMI_Mean = "Mean BMI",
  Male_Percent = "% Male",
  Female_Percent = "% Female"
)
arm_summary %>%
  flextable() %>%
  set_header_labels(values = header_labels) %>%
  autofit()
| Treatment Arm | Sample Size | Mean Age | Mean BMI | % Male | % Female |
|---|---|---|---|---|---|
| Dapagliflozin 10mg | 97 | 57.6 | 30.9 | 58 | 42 |
| Dapagliflozin 5mg | 98 | 59.1 | 32.1 | 60 | 40 |
| Placebo | 100 | 58.2 | 31.5 | 55 | 45 |
# Tplyr table: descriptive-statistics layers for age_mean and bmi_mean,
# with treatment_arm as the treatment variable.
# NOTE(review): the result is assigned to a variable named `tplyr_table`,
# the same name as the Tplyr constructor function called on this line.
# NOTE(review): baseline_clean has one row per arm here, so each layer
# reports n = 1 and an empty SD in the printed output; subject-level data
# would be needed for a meaningful "Mean (SD)".
tplyr_table <- tplyr_table(baseline_clean, treatment_arm) %>%
add_layer(
# First layer: summary rows for age_mean, labelled "Age (Mean ± SD)"
group_desc(age_mean, by = "Age (Mean ± SD)")
) %>%
add_layer(
# Second layer: summary rows for bmi_mean, labelled "BMI (Mean ± SD)"
group_desc(bmi_mean, by = "BMI (Mean ± SD)")
) %>%
# build() turns the table specification into the output data frame
build()
# Print table
print(tplyr_table)
## # A tibble: 12 × 8
## row_label1 row_label2 `var1_Dapagliflozin 10mg` `var1_Dapagliflozin 5mg`
## <chr> <chr> <chr> <chr>
## 1 Age (Mean ± SD) n " 1" " 1"
## 2 Age (Mean ± SD) Mean (SD) "57.60 ( )" "59.10 ( )"
## 3 Age (Mean ± SD) Median "57.60" "59.10"
## 4 Age (Mean ± SD) Q1, Q3 "57.60, 57.60" "59.10, 59.10"
## 5 Age (Mean ± SD) Min, Max "57.6, 57.6" "59.1, 59.1"
## 6 Age (Mean ± SD) Missing " 0" " 0"
## 7 BMI (Mean ± SD) n " 1" " 1"
## 8 BMI (Mean ± SD) Mean (SD) "30.90 ( )" "32.10 ( )"
## 9 BMI (Mean ± SD) Median "30.90" "32.10"
## 10 BMI (Mean ± SD) Q1, Q3 "30.90, 30.90" "32.10, 32.10"
## 11 BMI (Mean ± SD) Min, Max "30.9, 30.9" "32.1, 32.1"
## 12 BMI (Mean ± SD) Missing " 0" " 0"
## # ℹ 4 more variables: var1_Placebo <chr>, ord_layer_index <int>,
## # ord_layer_1 <int>, ord_layer_2 <int>
To explore the interactive dashboard for the clinical trial summary, please open and run the `app.R` file separately in RStudio.
The dashboard includes:

- Baseline data explorer by treatment arm
- Adverse Events bar charts
- Biomarker violin plots
- Kaplan-Meier survival curves
This interactive RMarkdown report summarizes clinical trial findings with embedded visualizations and tables. Use the tabs to explore each section.
# Treatment arm labels shared by every simulated dataset below.
# The 10mg arm was previously misspelled "Dapagliflazin 10mg" in all three
# places it was typed, so the label in tables and plot facets did not match
# the drug name used for the 5mg arm. Defining the labels once keeps them
# consistent. The order of the labels is unchanged, so the seeded draws are
# the same as before.
treatment_arms <- c("Placebo", "Dapagliflozin 5mg", "Dapagliflozin 10mg")

# Simulated biomarker data: 50 subject ids crossed with the three arms, a
# random responder flag, and a biomarker level drawn around a higher mean
# for responders (5.5) than for non-responders (4.2).
set.seed(123)
biomarker_data <- expand.grid(
  treatment_arm = treatment_arms,
  subject_id = 1:50
) %>%
  mutate(
    responder_status = sample(
      c("Responder", "Non-Responder"),
      n(),
      replace = TRUE
    ),
    biomarker_level = round(
      rnorm(
        n(),
        mean = ifelse(responder_status == "Responder", 5.5, 4.2),
        sd = 1
      ),
      2
    )
  )

# Simulated baseline data: 150 rows with a randomly assigned arm.
# Note: this overwrites the `baseline_clean` object created earlier.
baseline_clean <- data.frame(
  treatment_arm = sample(treatment_arms, 150, replace = TRUE),
  age_mean = rnorm(150, mean = 60, sd = 10),
  bmi_mean = rnorm(150, mean = 27, sd = 4)
)

# Simulated AE data: 100 rows of arm, category, event count and severity.
# Note: this overwrites the `ae_clean` object created earlier.
ae_clean <- data.frame(
  treatment_arm = sample(treatment_arms, 100, replace = TRUE),
  ae_category = sample(c("Headache", "Nausea", "Fatigue"), 100, replace = TRUE),
  n_events = sample(1:10, 100, replace = TRUE),
  ae_severity = sample(c("Mild", "Moderate", "Severe"), 100, replace = TRUE)
)

# Simulated survival data: exponential times and a random event indicator,
# reusing the arm assignment of the simulated baseline rows.
surv_data <- data.frame(
  time = rexp(150, 0.1),
  status = sample(0:1, 150, replace = TRUE),
  treatment = baseline_clean$treatment_arm
)
# Interactive baseline table: five rows per page, horizontal scrolling on
baseline_table_options <- list(pageLength = 5, scrollX = TRUE)
DT::datatable(
  data = baseline_clean,
  caption = "Interactive Table: Baseline Characteristics",
  options = baseline_table_options
)
# Static AE bar chart (severity-dodged, one panel per arm), then converted
# to an interactive plotly widget
ae_static_mapping <- aes(x = ae_category, y = n_events, fill = ae_severity)
ae_plot_static <- ggplot(data = ae_clean, mapping = ae_static_mapping) +
  geom_col(position = "dodge") +
  facet_wrap(~ treatment_arm) +
  labs(
    title = "Adverse Events by Treatment Arm",
    x = "Adverse Event",
    y = "Event Count"
  ) +
  theme_minimal()
plotly::ggplotly(ae_plot_static)
# Biomarker distribution by responder status: untrimmed violins with a
# narrow white boxplot overlaid, one panel per arm, rendered via plotly
violin_mapping <- aes(
  x = responder_status,
  y = biomarker_level,
  fill = responder_status
)
biomarker_violin <- ggplot(data = biomarker_data, mapping = violin_mapping) +
  geom_violin(trim = FALSE) +
  geom_boxplot(width = 0.1, fill = "white") +
  facet_wrap(~ treatment_arm) +
  labs(
    title = "Biomarker Levels by Responder Status",
    x = "Response",
    y = "Biomarker Level"
  ) +
  theme_minimal()
plotly::ggplotly(biomarker_violin)
# Kaplan-Meier fit by treatment on the simulated survival data, plotted
# with a p-value annotation and a risk table
km_fit <- survfit(
  Surv(time, status) ~ treatment,
  data = surv_data
)
survminer::ggsurvplot(
  fit = km_fit,
  data = surv_data,
  ggtheme = theme_minimal(),
  pval = TRUE,
  risk.table = TRUE,
  risk.table.y.text.col = TRUE
)
Note: Shiny dashboards require a separate runtime environment and are not rendered directly in this report.
This RMarkdown report simulates a clinical trial summary workflow. It contains high-quality tables, survival plots, and interactive widgets.